
require(tidyverse)
require(estimatr)
require(Synth)
require(scpi)



####################################################################################################
####################################################################################################
#
#         Containing Large-Scale Criminal Violence through Internationalized Prosecution: 
#              How the Collaboration between the CICIG and Guatemala’s Law Enforcement
#                  Contributed to a Sustained Reduction in the Murder Rate
#         
#                                         - prediction intervals -
#
#                             Guillermo Trejo                   Camilo Nieto-Matiz
#                               gtrejo@nd.edu                 camilo.nieto-matiz@utsa.edu
#                 
####################################################################################################
####################################################################################################


##########################################################################################
#                    Code below is based on:
#   Matias D. Cattaneo, Yingjie Feng, Filippo Palomba and Rocio Titiunik. 2022.
# "scpi: Uncertainty Quantification for Synthetic Control Estimators", Working paper.
#        https://github.com/nppackages/scpi/blob/main/R/scpi_illustration.R
##########################################################################################


# load data
load("guatemala data.rda")




################################################################
#            Homicide rates - prediction intervals
################################################################

# select countries and variables
dath <- dat %>% select("country", "year", "homrates_oms", "loggdp", "logimr", "growth", "ginimkt", "schooling2") %>% 
  filter(country %in% c("Guatemala","Honduras", "Panama",  "Venezuela", "Costa Rica",  
                        "Dominican Republic", "Nicaragua", "Bolivia"))


### Set options for data preparation
id.var      <- "country"                             # ID variable
time.var    <- "year"                                # Time variable
period.pre  <- seq(from = 2002, to = 2007, by = 1)   # Pre-treatment period
period.post <- (2008:2019)                           # Post-treatment period
unit.tr     <- "Guatemala"                        # Treated unit (in terms of id.var)
unit.co     <- unique(dath$country)[-4]              # Donors pool
outcome.var.h <- "homrates_oms"                                 # Outcome variable
cov.adj     <- NULL                                  # Covariates for adjustment
features.h    <- c("homrates_oms", "loggdp", "logimr", "growth", "ginimkt", "schooling2")                              # No features other than outcome
constant    <- T                                     # No constant term
report.missing <- F                                  # To check where missing values are
cointegrated.data <- T                               # Belief that the data are cointegrated


### Data preparation
df  <-   scdata(df = dath, id.var = id.var, time.var = time.var, outcome.var = outcome.var.h,
                period.pre = period.pre, period.post = period.post,
                unit.tr = unit.tr, unit.co = unit.co, cov.adj = cov.adj, features = features.h,
                constant = constant,  report.missing = report.missing, cointegrated.data = cointegrated.data)



## Set options for inference
u.alpha  <- 0.05                         # Confidence level (in-sample uncertainty)
e.alpha  <- 0.05                         # Confidence level (out-of-sample uncertainty)
rho      <- NULL                         # Regularization parameter (if NULL it is estimated)
rho.max  <- 1                            # Maximum value attainable by rho
sims     <- 200                          # Number of simulations
V        <- NULL                         # Weighting matrix (if NULL it is the identity matrix)
u.order  <- 1                            # Degree of polynomial in B and C when modelling u
u.lags   <- 0                            # Lags of B to be used when modelling u
u.sigma  <- "HC1"                        # Estimator for the variance-covariance of u
u.missp  <- T                            # If TRUE then the model is treated as misspecified
e.lags   <- 0                            # Degree of polynomial in B and C when modelling e
e.order  <- 1                            # Lags of B to be used when modelling e
e.method <- "all"                       # Estimation method for out-of-sample uncertainty
cores    <- 2                            # Number of cores to be used by scpi
w.constr <- list(name = "simplex")       # Simplex-type constraint set

set.seed(8894)
result  <- scpi(data = df,u.order = u.order, u.lags = u.lags, u.sigma = u.sigma, 
                u.missp = u.missp, sims = sims, e.order = e.order, e.lags = e.lags,
                e.method = e.method, cores = cores, w.constr = w.constr, u.alpha = u.alpha,
                e.alpha = e.alpha, V = V, rho = rho, rho.max = rho.max) 

### SC - plot results
scplot(result = result, plot.range = (2002:2019),
       label.xy = list(x.lab = "Year", y.lab = "Homicide rates"),
       x.ticks = NULL, e.out = T, event.label = NULL)



### SC - manually reproduce plot
# Store data on treated unit, synthetic unit, and prediction bars

y.fit <- rbind(result$est.results$Y.pre.fit, result$est.results$Y.post.fit)
y.act <- rbind(result$data$Y.pre, result$data$Y.post)

sc.l.qreg  <- result$inference.results$CI.all.qreg[, 1, drop = F]
sc.r.qreg  <- result$inference.results$CI.all.qreg[, 2, drop = F]

sc.l.gau  <- result$inference.results$CI.all.gaussian[, 1, drop = F]
sc.r.gau  <- result$inference.results$CI.all.gaussian[, 2, drop = F]

sc.l.ls  <- result$inference.results$CI.all.ls[, 1, drop = F]
sc.r.ls  <- result$inference.results$CI.all.ls[, 2, drop = F]



# Store other specifics
period.pre  <- result$data$specs$period.pre
period.post <- result$data$specs$period.post
T0          <- period.pre[length(period.pre)] # intercept
plot.range  <- c(period.pre, period.post)


# Actual data
h.dat    <- data.frame(t     = c(period.pre, period.post),
                       Y.act = c(y.act),
                       sname = "Treated")

# Fill with NAs Y.fit and confidence bounds where missing
Y.fit.na  <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.l.qreg.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.r.qreg.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.l.gau.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.r.gau.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.l.ls.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))
sc.r.ls.na   <- matrix(NA, nrow = length(c(period.pre, period.post)))

not.missing.plot <- c(period.pre,period.post) %in% rownames(y.fit)
not.missing.ci.qreg   <- c(period.pre,period.post) %in% rownames(sc.l.qreg)
not.missing.ci.ls   <- c(period.pre,period.post) %in% rownames(sc.l.ls)
not.missing.ci.gau   <- c(period.pre,period.post) %in% rownames(sc.l.gau)


Y.fit.na[not.missing.plot, 1] <- y.fit
sc.l.qreg.na[not.missing.ci.qreg, 1]    <- sc.l.qreg
sc.r.qreg.na[not.missing.ci.qreg, 1]    <- sc.r.qreg
sc.l.gau.na[not.missing.ci.gau, 1]    <- sc.l.gau
sc.r.gau.na[not.missing.ci.gau, 1]    <- sc.r.gau
sc.l.ls.na[not.missing.ci.ls, 1]    <- sc.l.ls
sc.r.ls.na[not.missing.ci.ls, 1]    <- sc.r.ls


# Synthetic unit data
h.dat.sc <- data.frame(t        = c(period.pre, period.post),
                       Y.sc     = Y.fit.na,
                       lb.qreg       = c(sc.l.qreg.na), ub.qreg = c(sc.r.qreg.na),
                       lb.gau       = c(sc.l.gau.na), ub.gau = c(sc.r.gau.na),
                       lb.ls       = c(sc.l.ls.na), ub.ls = c(sc.r.ls.na),
                       sname    = "SC Unit")

# merge
h.dat.plot    <- subset(h.dat,    t %in% plot.range)
h.dat.sc.plot <- subset(h.dat.sc, t %in% plot.range)
h.plotdf <- dplyr::left_join(h.dat.plot, h.dat.sc.plot, by = 't')


### plot (quantile regression)
ggplot(h.plotdf) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.qreg, ymax=ub.qreg), 
                col="blue", width=0.4, linetype=1, size=.4) + ylim(10,80)+  
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Homicide rates", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5))+
  geom_vline(xintercept=2008, color="red", linetype=2)



### plot (location-scale model)
ggplot(h.plotdf) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.qreg, ymax=ub.qreg), 
                col="blue", width=0.4, linetype=1, size=.4) + ylim(10,80)+  
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Homicide rates", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5)) +
  geom_vline(xintercept=2008, color="red", linetype=2)



### plot (sub-gaussian bounds)
ggplot(h.plotdf) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.qreg, ymax=ub.qreg), 
                col="blue", width=0.4, linetype=1, size=.4) + ylim(10,80)+  
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Homicide rates", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5))+
  geom_vline(xintercept=2008, color="red", linetype=2)







################################################################
#            Human rights - prediction intervals
################################################################


dathr <- dat %>% select("country", "year", "theta_mean_rev", "logpop", "growth", "ginimkt", "loggdp", "v2x_libdem") %>% 
  filter(country %in% c("Guatemala","Honduras", "Panama",  "Venezuela", "Costa Rica",  
                        "Dominican Republic", "Nicaragua"))



### Set options for data preparation
id.var      <- "country"                             # ID variable
time.var    <- "year"                                # Time variable
period.pre  <- seq(from = 2002, to = 2007, by = 1)   # Pre-treatment period
period.post <- (2008:2019)                           # Post-treatment period
unit.tr     <- "Guatemala"                           # Treated unit (in terms of id.var)
unit.co.hr     <- unique(dathr$country)[-3]             # Donors pool
outcome.var.hr <- "theta_mean_rev"                      # Outcome variable
cov.adj     <- NULL                                  # Covariates for adjustment
features.hr    <- c("theta_mean_rev", "logpop", "ginimkt", "loggdp", "v2x_libdem", "growth")                              # No features other than outcome
constant    <- T                                     # No constant term
report.missing <- F                                  # To check where missing values are
cointegrated.data <- T                               # Belief that the data are cointegrated



####################################
### Data preparation
df.hr  <-   scdata(df = dathr, id.var = id.var, time.var = time.var, outcome.var = outcome.var.hr,
                   period.pre = period.pre, period.post = period.post,
                   unit.tr = unit.tr, unit.co = unit.co.hr, cov.adj = cov.adj, features = features.hr,
                   constant = constant,  report.missing = report.missing, cointegrated.data = cointegrated.data)


####################################
## Set options for inference
u.alpha  <- 0.05                         # Confidence level (in-sample uncertainty)
e.alpha  <- 0.05                         # Confidence level (out-of-sample uncertainty)
rho      <- NULL                         # Regularization parameter (if NULL it is estimated)
rho.max  <- 1                            # Maximum value attainable by rho
sims     <- 200                          # Number of simulations
V        <- NULL                         # Weighting matrix (if NULL it is the identity matrix)
u.order  <- 1                            # Degree of polynomial in B and C when modelling u
u.lags   <- 0                            # Lags of B to be used when modelling u
u.sigma  <- "HC1"                        # Estimator for the variance-covariance of u
u.missp  <- T                            # If TRUE then the model is treated as misspecified
e.lags   <- 0                            # Degree of polynomial in B and C when modelling e
e.order  <- 1                            # Lags of B to be used when modelling e
e.method <- "all"                       # Estimation method for out-of-sample uncertainty
cores    <- 2                            # Number of cores to be used by scpi
w.constr <- list(name = "simplex")       # Simplex-type constraint set

set.seed(8894)
result.hr  <- scpi(data = df.hr,u.order = u.order, u.lags = u.lags, u.sigma = u.sigma, 
                   u.missp = u.missp, sims = sims, e.order = e.order, e.lags = e.lags,
                   e.method = e.method, cores = cores, w.constr = w.constr, u.alpha = u.alpha,
                   e.alpha = e.alpha, V = V, rho = rho, rho.max = rho.max) 


### SC plot results
scplot(result = result.hr, plot.range = (2002:2019),
       label.xy = list(x.lab = "Year", y.lab = "Human rights violations"),
       x.ticks = NULL, e.out = T, event.label = NULL)



### SC - manually reproduce plot
# Store data on treated unit, synthetic unit, and prediction bars

y.fit.hr <- rbind(result.hr$est.results$Y.pre.fit, result.hr$est.results$Y.post.fit)
y.act.hr <- rbind(result.hr$data$Y.pre, result.hr$data$Y.post)

sc.l.qreg.hr  <- result.hr$inference.results$CI.all.qreg[, 1, drop = F]
sc.r.qreg.hr  <- result.hr$inference.results$CI.all.qreg[, 2, drop = F]

sc.l.gau.hr  <- result.hr$inference.results$CI.all.gaussian[, 1, drop = F]
sc.r.gau.hr  <- result.hr$inference.results$CI.all.gaussian[, 2, drop = F]

sc.l.ls.hr  <- result.hr$inference.results$CI.all.ls[, 1, drop = F]
sc.r.ls.hr  <- result.hr$inference.results$CI.all.ls[, 2, drop = F]



# Store other specifics
period.pre.hr  <- result.hr$data$specs$period.pre
period.post.hr <- result.hr$data$specs$period.post
T0.hr          <- period.pre[length(period.pre)] # intercept
plot.range.hr  <- c(period.pre, period.post)


# Actual data
hr.dat    <- data.frame(t     = c(period.pre.hr, period.post.hr),
                        Y.act = c(y.act.hr),
                        sname = "Treated")

# Fill with NAs Y.fit and confidence bounds where missing
Y.fit.na.hr  <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.l.qreg.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.r.qreg.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.l.gau.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.r.gau.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.l.ls.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))
sc.r.ls.na.hr   <- matrix(NA, nrow = length(c(period.pre.hr, period.post.hr)))

not.missing.plot.hr <- c(period.pre.hr,period.post.hr) %in% rownames(y.fit.hr)
not.missing.ci.qreg.hr   <- c(period.pre.hr,period.post.hr) %in% rownames(sc.l.qreg.hr)
not.missing.ci.ls.hr   <- c(period.pre.hr,period.post.hr) %in% rownames(sc.l.ls.hr)
not.missing.ci.gau.hr   <- c(period.pre.hr,period.post.hr) %in% rownames(sc.l.gau.hr)


Y.fit.na.hr[not.missing.plot.hr, 1] <- y.fit.hr
sc.l.qreg.na.hr[not.missing.ci.qreg.hr, 1]    <- sc.l.qreg.hr
sc.r.qreg.na.hr[not.missing.ci.qreg.hr, 1]    <- sc.r.qreg.hr
sc.l.gau.na.hr[not.missing.ci.gau.hr, 1]    <- sc.l.gau.hr
sc.r.gau.na.hr[not.missing.ci.gau.hr, 1]    <- sc.r.gau.hr
sc.l.ls.na.hr[not.missing.ci.ls.hr, 1]    <- sc.l.ls.hr
sc.r.ls.na.hr[not.missing.ci.ls.hr, 1]    <- sc.r.ls.hr


# Synthetic unit data
hr.dat.sc <- data.frame(t        = c(period.pre.hr, period.post.hr),
                        Y.sc     = Y.fit.na.hr,
                        lb.qreg       = c(sc.l.qreg.na.hr), ub.qreg = c(sc.r.qreg.na.hr),
                        lb.gau       = c(sc.l.gau.na.hr), ub.gau = c(sc.r.gau.na.hr),
                        lb.ls       = c(sc.l.ls.na.hr), ub.ls = c(sc.r.ls.na.hr),
                        sname    = "SC Unit")

# merge
hr.dat.plot    <- subset(hr.dat,    t %in% plot.range.hr)
hr.dat.sc.plot <- subset(hr.dat.sc, t %in% plot.range.hr)
plotdf.hr <- dplyr::left_join(hr.dat.plot, hr.dat.sc.plot, by = 't')


### plot (quantile regression)
ggplot(plotdf.hr) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.qreg, ymax=ub.qreg), 
                col="blue", width=0.4, linetype=1, size=.4) +  ylim(-1.5,2)+
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Human rights violations", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5))+
  geom_vline(xintercept=2008, color="red", linetype=2)


### plot (location-scale model)
ggplot(plotdf.hr) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.ls, ymax=ub.ls), 
                col="blue", width=0.4, linetype=1, size=.4) +  ylim(-1.5,2)+
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Human rights violations", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5))+
  geom_vline(xintercept=2008, color="red", linetype=2)


### plot (sub-gaussian bounds)
ggplot(plotdf.hr) +
  geom_line(aes(x=t, y=Y.sc, colour="Synthetic Guatemala",linetype="Synthetic Guatemala"),  size=1)+
  geom_point( aes(x=t, y=Y.sc, colour="Synthetic Guatemala"), shape=1) +
  geom_line(aes(x=t, y=Y.act, colour="Guatemala",linetype="Guatemala"),  size=1) +
  geom_point(aes(x=t, y=Y.act, colour="Guatemala"), shape=19) +
  geom_errorbar(aes(x=t, ymin=lb.gau, ymax=ub.gau), 
                col="blue", width=0.4, linetype=1, size=.4) +  ylim(-1.5,2)+
  theme_bw() + scale_x_continuous(expand = c(0, 0), breaks=seq(2002,2019,2)) +
  theme(legend.position = c(0.19,0.15), 
        legend.background = element_rect(fill = "transparent"))+
  labs(x="Year", y="Human rights violations", colour="") +
  scale_color_manual(name="", values = c("black", "blue"))+
  scale_linetype_manual(name="", values = c(1,5))+
  geom_vline(xintercept=2008, color="red", linetype=2)
